# =============================================================================
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
# SPDX-License-Identifier: GPL-3.0-or-later
# =============================================================================
# SCRIPT  : MAPLE_haplo_fraction.sh
# PROJECT : MAPLE (Methylation-Anchor Probe for Low Enrichment)
# PURPOSE : Quantify on-target and haplotype-specific fragment counts
#
# OVERVIEW:
#   This script processes stitched fragment BED files generated by MAPLE-Stitch,
#   identifies fragments matching predefined haplotype methylation patterns,
#   and calculates per-locus coverage and haplotype frequencies.
#
# INPUTS  :
#   - <sample>.ontarget.stitch.bed : On-target stitched fragments
#   - <pattern>.txt                : Haplotype methylation pattern definition table
#
# OUTPUTS :
#   - <sample>.on_target.coverage  : Per-locus coverage of on-target fragments
#   - <sample>.haplo.coverage      : Per-locus haplotype-specific fragment counts
#   - <sample>_tmp/                : Intermediate files for quality control
#
# USAGE   :
#   sbatch MAPLE_haplo_fraction.sh <sample_prefix> <stitch_pattern_file>
#
# AUTHOR  : Yangjunyi Li  (liyangjunyi@bytedance.com)
# CREATED : 2023-08-01
# UPDATED : 2025-09-10
#
# NOTE    :
#   - Designed for use after MAPLE fragment stitching (MAPLE_stitch.py).
#   - Uses bedtools and awk for pattern-based fragment extraction.
#   - Final haplotype ratios are computed using MAPLE_cal_haplo_fraction.py.
# =============================================================================


set -euo pipefail

# ----------------------- Help Function ------------------------
show_help() {
    echo "
Usage:
    bash MAPLE_ngs.sh -r R1.fastq.gz -l R2.fastq.gz -f Bismark_Ref_Folder -g Reference.fasta -o OutputPrefix

Options:
    -r     Path to Read 1 FASTQ file (.fastq.gz)
    -l     Path to Read 2 FASTQ file (.fastq.gz)
    -f     Bismark genome folder (must be prepared with bismark_genome_preparation)
    -g     Reference FASTA (for MAPLE stitching)
    -o     Output prefix
    -h     Show this help message
"
}

# ---------------------- Parse Arguments -----------------------
read1=""
read2=""
bismark_ref=""
ref_fasta=""
output=""
software_path=""
while getopts "r:l:f:g:o:h" opt; do
    case $opt in
        r) read1=$OPTARG ;;
        l) read2=$OPTARG ;;
        f) bismark_ref=$OPTARG ;;
        g) ref_fasta=$OPTARG ;;
        o) output=$OPTARG ;;
        h) show_help; exit 0 ;;
        \?) echo "[ERROR] Invalid option: -$OPTARG"; show_help; exit 1 ;;
    esac
done

# ---------------------- Input Validation ----------------------
if [[ -z "$read1" || -z "$read2" || -z "$bismark_ref" || -z "$ref_fasta" || -z "$output" ]]; then
    echo "[ERROR] Missing required arguments."
    show_help
    exit 1
fi

for file in "$read1" "$read2" "$ref_fasta"; do
    [[ ! -f "$file" ]] && echo "[ERROR] File not found: $file" && exit 1
done

[[ ! -d "$bismark_ref" ]] && echo "[ERROR] Bismark reference folder not found: $bismark_ref" && exit 1

# ---------------------- Prepare Directories -------------------
mkdir -p "$output"/{fastp,align,stitch,haplotype,script}
cd "$output"

# ---------------------- Step 1: fastp --------------------------
echo "[STEP 1] Running fastp for adaptor trimming..."
cd ${software_path}/fastp/
fastp \
    --in1 "$read1" \
    --in2 "$read2" \
    --out1 "${output}_R1.fastq.gz" \
    --out2 "${output}_R2.fastq.gz" \
    --json "${output}.fastp.json" \
    --html "${output}.fastp.html" \
    --failed_out "${output}_failed.out" \
    --unpaired1 "${output}_unpaired_r1.fastq.gz" \
    --unpaired2 "${output}_unpaired_r2.fastq.gz" \
    --thread 8
cd ../

# ---------------------- Step 2: Alignment ----------------------
echo "[STEP 2] Aligning reads with Bismark..."
cd align/
${software_path}/bismark "$bismark_ref" \
    -1 ../fastp/${output}_R1.fastq.gz \
    -2 ../fastp/${output}_R2.fastq.gz

# Sort and index BAM
echo "[STEP 3] Sorting and indexing BAM..."
bam_in="${output}_R1_bismark_bt2_pe.bam"
bam_sorted="${output}.bismark.sorted.bam"
${software_path}/samtools sort -@ 4 -m 8G "$bam_in" > "$bam_sorted"
${software_path}/samtools index "$bam_sorted"
rm "$bam_in"
cd ../

# ---------------------- Step 4: Stitching ----------------------
echo "[STEP 4] Running MAPLE stitch..."
cd stitch/
${software_path}/MAPLE_stitch \
    --bam ../align/"$bam_sorted" \
    --output "./${output}.stitch" \
    --reference "$ref_fasta"
cd ../

echo "[DONE] MAPLE NGS pipeline finished."
